# Replication File for Seljan and Gronke, 2022, "Happy Birthday You Get
# To Vote". 
#
# OREGON: Anonymize 2019 voter file for distribution 
#
#
# This file takes as input a January 2019 voter list file, obtainable
# from the Division of Elections, State of Oregon. This file represents
# the registration system after AVR was implemented and after the 2016
# presidential and 2018 midterm elections.
# 
# We have not
# distributed the raw data file because it contains personally identifiable
# information for millions of individual registrants. 
#
# The output files contain all data elements necessary to replicate the 
# full analysis.


library(data.table)
library(dplyr)
library(lubridate)
library(mosaic)
library(stringr)
library(AER)
library(broom)
library(tidyverse)
library(tidymodels)
library(stargazer)
library(radiant)
library(openxlsx)
library(wru)
library(gender)
library(genderdata)
library(fastDummies)

# Global print settings: a large `scipen` penalty makes R prefer fixed over
# scientific notation, and `digits = 3` prints 3 significant digits.
options(scipen = 100, digits = 3)


# Oregon 2018 Voter File --------------------------------------------------



# Paths of the registration extracts. `pattern` is a regular expression, so
# the dot is escaped and the match is anchored to the end of the file name;
# the previous ".txt" also matched names that merely contain "txt" after any
# character.
reg_files <- list.files(file.path(data_dir, "original_data/Statewide _VoterList_January2019/Registered Voters_January2019"),
                        pattern = "\\.txt$", full.names = TRUE)
# Read one tab-separated registration extract and keep only the fields used
# downstream. ZIP_CODE and SPLIT are forced to character; every other column
# type is left to read_tsv()'s guessing.
#
# file: path to a single registration extract.
# Returns the columns VOTER_ID, every column ending in "_NAME" or "_DATE",
# CONFIDENTIAL, STATUS, PARTY_CODE, COUNTY, ZIP_CODE, ABSENTEE_TYPE, PRECINCT
# and SPLIT.
read_voter_list <- function(file) {
  forced_types <- cols(ZIP_CODE = col_character(), SPLIT = col_character())
  registrations <- read_tsv(file, col_types = forced_types)
  select(
    registrations,
    VOTER_ID, ends_with("_NAME"), ends_with("_DATE"),
    CONFIDENTIAL, STATUS, PARTY_CODE, COUNTY, ZIP_CODE,
    ABSENTEE_TYPE, PRECINCT, SPLIT
  )
}

# Read every registration extract found above.
voter_reg_list <- lapply(reg_files, read_voter_list)

# Coerce BIRTH_DATE to numeric in each piece so that bind_rows() sees one
# consistent column type. Iterating over the list (rather than indexing
# [[1]] .. [[5]]) keeps this correct for any number of input files.
voter_reg_list <- lapply(voter_reg_list, function(registrations) {
  registrations$BIRTH_DATE <- as.numeric(registrations$BIRTH_DATE)
  registrations
})

# Stack the extracts and drop rows whose VOTER_ID is the literal "ACP"
# (presumably placeholder rows for confidential registrants -- confirm).
# VOTER_ID is then coerced to numeric, mirroring what is done for the
# voting-history table, so the join key has the same type on both sides.
voter_file <- bind_rows(voter_reg_list) %>%
  filter(VOTER_ID != "ACP") %>%
  mutate(VOTER_ID = as.numeric(VOTER_ID))


# Paths of the voting-history extracts. `pattern` is a regular expression, so
# the dot is escaped and the match is anchored to the end of the file name;
# the previous ".txt" also matched names that merely contain "txt" after any
# character.
hist_files <- list.files(file.path(data_dir, "original_data/Statewide _VoterList_January2019/Voting History_January2019"),
                         pattern = "\\.txt$", full.names = TRUE)

# Read one tab-separated voting-history extract with read_tsv()'s default
# column-type guessing; all columns are kept.
#
# file: path to a single voting-history extract.
# Returns whatever read_tsv() returns for that file.
read_voter_history <- function(file) read_tsv(file)


# Read every voting-history extract found above.
voter_hist_list <- lapply(hist_files, read_voter_history)

# Columns coerced to numeric in every extract so that bind_rows() sees one
# consistent type per column across files. Values that are not numeric
# become NA (with a coercion warning), exactly as with a direct as.numeric().
hist_numeric_cols <- c("BIRTH_DATE", "HOUSE_NUM", "UNIT_NUM", "ZIP_CODE",
                       "SPLIT", "EFF_ZIP_PLUS_FOUR", "ZIP_PLUS_FOUR",
                       "EFF_ZIP_CODE")

# Apply the coercion to each extract. Iterating over the list (rather than
# indexing [[1]] .. [[5]] once per column) keeps this correct for any number
# of input files and keeps the column list in one place.
voter_hist_list <- lapply(voter_hist_list, function(history) {
  history[hist_numeric_cols] <- lapply(history[hist_numeric_cols], as.numeric)
  history
})

# Stack the extracts, drop rows whose VOTER_ID is the literal "ACP", and make
# the join key numeric.
voter_history <- bind_rows(voter_hist_list) %>%
  filter(VOTER_ID != "ACP") %>%
  mutate(VOTER_ID = as.numeric(VOTER_ID))


# Attach voting history to every registration row; registrants without a
# history row are kept (left join). Columns present in both tables receive
# the .x / .y suffixes used later in this script.
OR_2018 <- voter_file %>%
  left_join(voter_history, by = "VOTER_ID")

# Read the DOB variables supplied by the SOS and merge them in by VOTER_ID.
OR_2018_DOB <- fread(
  file.path(data_dir, "original_data/OMV_Registration_DOB_2020-01-08.txt"),
  header = TRUE
)

OR_2018 <- OR_2018 %>%
  left_join(OR_2018_DOB, by = "VOTER_ID")

# Checkpoint of the merged file. This copy is NOT anonymized (names are still
# present), which is why it is written next to the original data.
save(
  OR_2018,
  file = file.path(data_dir, "original_data/OR2018_voterfile_merged.RData")
)




# Populous-county dummy (defined as counties over 150K): 1 for the seven
# counties listed below, 0 for every other value of COUNTY.x, including
# missing values.
OR_2018 <- OR_2018 %>%
  mutate(
    populous_county = as.numeric(
      COUNTY.x %in% c("MULTNOMAH", "WASHINGTON", "CLACKAMAS", "LANE",
                      "MARION", "JACKSON", "DESCHUTES")
    )
  )

# County FIPS lookup, merged on COUNTY.x (the spreadsheet must therefore
# carry a column with that exact name).
OR_fips <- read.xlsx(
  file.path(data_dir, "Supplementary Files/Oregon County FIPS.xlsx")
)
OR_2018 <- OR_2018 %>%
  left_join(OR_fips, by = "COUNTY.x")

# Copy the registration last name into `surname` (presumably the column name
# the race-prediction step below expects -- confirm against wru).
OR_2018$surname <- OR_2018$LAST_NAME.x



library(wru)

# Census API key, defined once and reused for both wru calls.
# NOTE(review): a key is committed in this file in plain text. Setting the
# CENSUS_API_KEY environment variable overrides it; the literal is kept only
# as a fallback so the script still runs unchanged. Consider rotating the key
# and removing the literal.
census_api_key <- Sys.getenv("CENSUS_API_KEY")
if (!nzchar(census_api_key)) {
  census_api_key <- "ae147f31d2e5acfa2a6f8cafc8befe65dfd6d462"
}

# County-level census data for Oregon, then per-registrant race predictions
# (adds the pred.whi / pred.bla / pred.his / pred.asi / pred.oth columns used
# below).
OR_census_race <- get_census_data(key = census_api_key, states = "OR",
                                  age = FALSE, sex = FALSE,
                                  census.geo = "county", retry = 0)
OR_2018$state <- "OR"
OR_2018 <- predict_race(voter.file = OR_2018, census.data = OR_census_race,
                        census.geo = "county", census.key = census_api_key)

# One 0/1 dummy per race category: 1 only when that category's predicted
# probability is strictly greater than each of the other four. Exact ties
# therefore set every tied dummy to 0, and missing probabilities propagate
# through `&` and ifelse() as NA.
race_pred_cols <- c(white = "pred.whi", black = "pred.bla",
                    hispanic = "pred.his", other_race = "pred.oth",
                    asian = "pred.asi")
for (race in names(race_pred_cols)) {
  own <- OR_2018[[race_pred_cols[[race]]]]
  rivals <- setdiff(race_pred_cols, race_pred_cols[[race]])
  beats_all <- Reduce(`&`, lapply(rivals, function(rival) own > OR_2018[[rival]]))
  OR_2018[[race]] <- ifelse(beats_all, 1, 0)
}

#predict gender
library(gender)
library(genderdata)

# Look up each (first name, birth year) pair. The result carries the columns
# name, year_min, year_max, proportion_male, proportion_female and gender.
# NOTE(review): `birth_year` is not created anywhere in this script; confirm
# it is supplied by one of the joined inputs.
gender <- gender_df(OR_2018, name_col = "FIRST_NAME.x", year_col = "birth_year")

# Build the lookup table that is merged back onto the voter file:
#  * gender_female = 1 when at least half of the people with that name are
#    female, 0 otherwise;
#  * the keys are renamed to match OR_2018 (birth_year, FIRST_NAME.x);
#  * birth years are limited to 1906-1999 to limit the gender data for memory;
#  * distinct() keeps only the three listed columns, one row per combination.
gender <- gender %>%
  mutate(gender_female = ifelse(proportion_female >= 0.5, 1, 0),
         birth_year = year_min,
         FIRST_NAME.x = name) %>%
  filter(birth_year > 1905 & birth_year < 2000) %>%
  distinct(gender_female, birth_year, FIRST_NAME.x)

OR_2018 <- left_join(OR_2018, gender, by = c("FIRST_NAME.x", "birth_year"))

# Drop leftover gender_df() helper columns if any are present. The lookup
# table above no longer carries them, so an unconditional select(-year_min)
# etc. would stop with a "column doesn't exist" error; any_of() ignores
# names that are absent.
OR_2018 <- select(OR_2018,
                  -any_of(c("year_min", "year_max", "name", "proportion_male")))

# Three-way gender code read by the dummies below: 0 = male, 1 = female,
# 2 = no prediction.
# NOTE(review): `gender3` is not created anywhere in this script. When it is
# absent it is derived here from gender_female, with unmatched registrants
# (NA) coded 2 -- this coding is inferred from the dummy names below; confirm
# against the original analysis. An existing gender3 column is left untouched.
if (!"gender3" %in% names(OR_2018)) {
  OR_2018$gender3 <- ifelse(is.na(OR_2018$gender_female), 2,
                            OR_2018$gender_female)
}

OR_2018$male <- ifelse(OR_2018$gender3 == 0, 1, 0)
OR_2018$female <- ifelse(OR_2018$gender3 == 1, 1, 0)
OR_2018$nogender <- ifelse(OR_2018$gender3 == 2, 1, 0)


# Recode the four "Y"-coded window flags into 1/0 indicators (1 when the flag
# equals "Y", 0 for any other non-missing value).
OR_2018 <- OR_2018 %>%
  mutate(
    BSEP25 = ifelse(BEFORE_SEP_25 == "Y", 1, 0),
    ASEP25 = ifelse(AFTER_SEP_25 == "Y", 1, 0),
    BNOV6 = ifelse(BEFORE_NOV_6 == "Y", 1, 0),
    AAUG14 = ifelse(AFTER_AUG_14 == "Y", 1, 0)
  )

# Give the three date-named columns kept for the analysis readable names
# (presumably per-election turnout columns from the voting history --
# confirm).
OR_2018 <- OR_2018 %>%
  rename(
    general18 = "11/06/2018",
    primary18 = "05/15/2018",
    general16 = "11/08/2016"
  )

#Reduce number of variables: drop the remaining date-named columns.
OR_2018 <- OR_2018 %>%
  select(
    -c("05/16/2017", "05/17/2016", "05/19/2015", "11/04/2014",
       "05/20/2014", "11/05/2013", "05/21/2013", "11/06/2012",
       "05/15/2012", "01/31/2012", "11/08/2011", "05/17/2011",
       "11/02/2010", "05/18/2010", "01/26/2010", "05/19/2009",
       "11/04/2008", "05/20/2008")
  )

#Anonymity: keep only the listed analysis columns; names, voter IDs and
# every other column are dropped here.
# NOTE(review): `lub_regdate` is not created anywhere in this script; confirm
# where it comes from, since select() stops if the column is missing.
OR_2018 <- OR_2018 %>%
  select(
    BIRTH_DATE.x, EFF_REGN_DATE.x, PARTY_CODE.x,
    general16, general18, primary18, lub_regdate,
    BSEP25, ASEP25, BNOV6, AAUG14,
    populous_county,
    white, black, hispanic, other_race, asian,
    female, nogender
  )

# Written to the current working directory (not under data_dir).
saveRDS(OR_2018, file = "OR2018_anon.RDS")


